{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 处理活动数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "导入必要的包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import scipy.sparse as ss\n",
    "import scipy.io as sio\n",
    "\n",
    "#保存数据\n",
    "import _pickle\n",
    "\n",
    "from sklearn.preprocessing import normalize\n",
    "#相似度/距离\n",
    "import scipy.spatial.distance as ssd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#特征编码\n",
    "import datetime\n",
    "import hashlib\n",
    "import locale\n",
    "\n",
    "from collections import defaultdict\n",
    "from sklearn.preprocessing import normalize\n",
    "\n",
    "class FeatureEng:\n",
    "    \"\"\"Encode raw attribute strings (locale, gender, dates, ...) as integers.\n",
    "\n",
    "    Every encoder returns 0 for a missing or unparseable value.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        # Locale ids: each alias known to the stdlib `locale` module gets a\n",
    "        # 1-based id, so 0 stays free for unknown locales.\n",
    "        self.localeIdMap = defaultdict(int)\n",
    "        for i, l in enumerate(locale.locale_alias.keys()):\n",
    "            self.localeIdMap[l] = i + 1\n",
    "\n",
    "        # Gender ids; missing values are encoded as 0.\n",
    "        self.genderIdMap = defaultdict(int, {'NaN': 0, \"male\": 1, \"female\": 2})\n",
    "\n",
    "    def getLocaleId(self, locstr):\n",
    "        \"\"\"Return the id of `locstr` (matched case-insensitively), or 0 if unknown.\"\"\"\n",
    "        # .get() rather than [] so that unknown keys are not inserted into the map.\n",
    "        return self.localeIdMap.get(locstr.lower(), 0)\n",
    "\n",
    "    def getGenderId(self, genderStr):\n",
    "        \"\"\"Return 1 for \"male\", 2 for \"female\" and 0 for anything else.\"\"\"\n",
    "        # .get() rather than [] so that unknown keys are not inserted into the map.\n",
    "        return self.genderIdMap.get(genderStr, 0)\n",
    "\n",
    "    def getJoinedYearMonth(self, dateString):\n",
    "        \"\"\"Encode a \"%Y-%m-%dT%H:%M:%S.%fZ\" timestamp as (year - 2010) * 12 + month.\n",
    "\n",
    "        Returns 0 when `dateString` is not a string or does not match the format.\n",
    "        \"\"\"\n",
    "        try:\n",
    "            dttm = datetime.datetime.strptime(dateString, \"%Y-%m-%dT%H:%M:%S.%fZ\")\n",
    "        except (TypeError, ValueError):  # missing / malformed value -> 0\n",
    "            return 0\n",
    "        return (dttm.year - 2010) * 12 + dttm.month\n",
    "\n",
    "    def getBirthYearInt(self, birthYear):\n",
    "        \"\"\"Return `birthYear` as an int; 0 for the string \"None\" or a non-numeric value.\"\"\"\n",
    "        try:\n",
    "            return 0 if birthYear == \"None\" else int(birthYear)\n",
    "        except (TypeError, ValueError, OverflowError):  # missing value -> 0\n",
    "            return 0\n",
    "\n",
    "    def getTimezoneInt(self, timezone):\n",
    "        \"\"\"Return `timezone` as an int, or 0 when it cannot be converted.\"\"\"\n",
    "        try:\n",
    "            return int(timezone)\n",
    "        except (TypeError, ValueError, OverflowError):  # missing value -> 0\n",
    "            return 0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "计算活动数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of records :3137972\n"
     ]
    }
   ],
   "source": [
    "# Count the data rows of events.csv; the first line is read and discarded\n",
    "# before counting so it is not included in the total.\n",
    "with open(\"./data/events.csv\", 'r') as fin:\n",
    "    fin.readline()\n",
    "    # Only the count is needed, so the rows are not split into columns.\n",
    "    lines = sum(1 for _row in fin)\n",
    "\n",
    "print(\"number of records :%d\" % lines)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "读取训练和测试的活动"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of events in train & test :13418\n"
     ]
    }
   ],
   "source": [
    "# Load the pickled event index; its length is reported as the number of\n",
    "# events in train & test.\n",
    "# NOTE: unpickling can execute arbitrary code -- only load files you trust.\n",
    "with open(\"./data/PE_eventIndex.pkl\", 'rb') as pkl_file:\n",
    "    eventIndex = _pickle.load(pkl_file)\n",
    "n_events = len(eventIndex)\n",
    "\n",
    "print(\"number of events in train & test :%d\" % n_events)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'1566969892': 0,\n",
       " '199908595': 1,\n",
       " '428794879': 2,\n",
       " '1270937556': 3,\n",
       " '1921275': 4,\n",
       " '52214972': 5,\n",
       " '3438779574': 6,\n",
       " '4231864079': 7,\n",
       " '3161734222': 8,\n",
       " '1968146304': 9,\n",
       " '3877409694': 10,\n",
       " '2695866169': 11,\n",
       " '3512026987': 12,\n",
       " '4040181802': 13,\n",
       " '2219009935': 14,\n",
       " '4186433927': 15,\n",
       " '4161974839': 16,\n",
       " '2390991047': 17,\n",
       " '3901745148': 18,\n",
       " '1217700763': 19,\n",
       " '2355400133': 20,\n",
       " '1619013260': 21,\n",
       " '127371840': 22,\n",
       " '1276702165': 23,\n",
       " '8816449': 24,\n",
       " '1967012522': 25,\n",
       " '1298016856': 26,\n",
       " '4294677286': 27,\n",
       " '1949386711': 28,\n",
       " '1877217637': 29,\n",
       " '4085563097': 30,\n",
       " '2710165417': 31,\n",
       " '3522246891': 32,\n",
       " '2701666228': 33,\n",
       " '2843363655': 34,\n",
       " '1384103878': 35,\n",
       " '2002146936': 36,\n",
       " '894829625': 37,\n",
       " '177094411': 38,\n",
       " '3865569130': 39,\n",
       " '4112286860': 40,\n",
       " '508297104': 41,\n",
       " '3505158996': 42,\n",
       " '3007412705': 43,\n",
       " '539032065': 44,\n",
       " '2600467156': 45,\n",
       " '3277640596': 46,\n",
       " '2441556923': 47,\n",
       " '362071615': 48,\n",
       " '3188189145': 49,\n",
       " '2178346412': 50,\n",
       " '3462134235': 51,\n",
       " '2412421425': 52,\n",
       " '2810314125': 53,\n",
       " '3199229320': 54,\n",
       " '2493891004': 55,\n",
       " '3287069160': 56,\n",
       " '1293469319': 57,\n",
       " '866512763': 58,\n",
       " '2492936252': 59,\n",
       " '1519693555': 60,\n",
       " '1749797086': 61,\n",
       " '1224903972': 62,\n",
       " '3291155312': 63,\n",
       " '3654419574': 64,\n",
       " '4165471135': 65,\n",
       " '1440328560': 66,\n",
       " '4256163933': 67,\n",
       " '1538046875': 68,\n",
       " '3426952204': 69,\n",
       " '2627346430': 70,\n",
       " '1682999037': 71,\n",
       " '3906291725': 72,\n",
       " '671695205': 73,\n",
       " '1552700632': 74,\n",
       " '2997875645': 75,\n",
       " '640184889': 76,\n",
       " '3731803757': 77,\n",
       " '1944497323': 78,\n",
       " '956750026': 79,\n",
       " '1872758613': 80,\n",
       " '1142967652': 81,\n",
       " '2384544175': 82,\n",
       " '3506737502': 83,\n",
       " '2718812266': 84,\n",
       " '3863047854': 85,\n",
       " '2583376810': 86,\n",
       " '1548902066': 87,\n",
       " '229928069': 88,\n",
       " '3132828624': 89,\n",
       " '2980178978': 90,\n",
       " '4070309332': 91,\n",
       " '85254966': 92,\n",
       " '510790020': 93,\n",
       " '551895988': 94,\n",
       " '612824362': 95,\n",
       " '3327257991': 96,\n",
       " '2575303578': 97,\n",
       " '2157645470': 98,\n",
       " '408147094': 99,\n",
       " '2016892878': 100,\n",
       " '2605913428': 101,\n",
       " '1014040686': 102,\n",
       " '976975428': 103,\n",
       " '3980841936': 104,\n",
       " '1587116147': 105,\n",
       " '3439842241': 106,\n",
       " '939765972': 107,\n",
       " '1010938434': 108,\n",
       " '1321838948': 109,\n",
       " '849777211': 110,\n",
       " '3221372597': 111,\n",
       " '2046133166': 112,\n",
       " '3633351483': 113,\n",
       " '1865576144': 114,\n",
       " '360093292': 115,\n",
       " '3213836603': 116,\n",
       " '3516991107': 117,\n",
       " '3553950281': 118,\n",
       " '1249778557': 119,\n",
       " '2660643006': 120,\n",
       " '762391043': 121,\n",
       " '2339593321': 122,\n",
       " '1257844994': 123,\n",
       " '2587616435': 124,\n",
       " '402856544': 125,\n",
       " '91909487': 126,\n",
       " '3505087357': 127,\n",
       " '2355645674': 128,\n",
       " '3980763324': 129,\n",
       " '736179869': 130,\n",
       " '2011074541': 131,\n",
       " '3437453244': 132,\n",
       " '2546783441': 133,\n",
       " '2042878904': 134,\n",
       " '2814838138': 135,\n",
       " '1975008770': 136,\n",
       " '4181789141': 137,\n",
       " '2011450028': 138,\n",
       " '2746302339': 139,\n",
       " '19341280': 140,\n",
       " '3949033351': 141,\n",
       " '3364101550': 142,\n",
       " '1369337496': 143,\n",
       " '4236404112': 144,\n",
       " '3658267924': 145,\n",
       " '1970721704': 146,\n",
       " '1427467626': 147,\n",
       " '2676810251': 148,\n",
       " '3611536448': 149,\n",
       " '255609235': 150,\n",
       " '2991901172': 151,\n",
       " '2274349979': 152,\n",
       " '4020625664': 153,\n",
       " '4074887767': 154,\n",
       " '1578063348': 155,\n",
       " '2345215183': 156,\n",
       " '2994578769': 157,\n",
       " '3050501263': 158,\n",
       " '4222772842': 159,\n",
       " '2533905418': 160,\n",
       " '2507282360': 161,\n",
       " '2730018676': 162,\n",
       " '760605611': 163,\n",
       " '253143325': 164,\n",
       " '2886180176': 165,\n",
       " '1619986209': 166,\n",
       " '3824757637': 167,\n",
       " '1003847016': 168,\n",
       " '3165653791': 169,\n",
       " '1356178708': 170,\n",
       " '1736093461': 171,\n",
       " '4201630080': 172,\n",
       " '1136599794': 173,\n",
       " '3293166549': 174,\n",
       " '3991693053': 175,\n",
       " '1555104795': 176,\n",
       " '4070118191': 177,\n",
       " '2374444123': 178,\n",
       " '913255694': 179,\n",
       " '4231641706': 180,\n",
       " '2262878747': 181,\n",
       " '274918146': 182,\n",
       " '3454922346': 183,\n",
       " '3302882916': 184,\n",
       " '2904863529': 185,\n",
       " '1697930615': 186,\n",
       " '990120316': 187,\n",
       " '2082259770': 188,\n",
       " '2005419859': 189,\n",
       " '3072990888': 190,\n",
       " '2818648337': 191,\n",
       " '2911627667': 192,\n",
       " '2551484859': 193,\n",
       " '2142921098': 194,\n",
       " '1722968298': 195,\n",
       " '3347784034': 196,\n",
       " '2658555390': 197,\n",
       " '3012567312': 198,\n",
       " '3987699038': 199,\n",
       " '263011441': 200,\n",
       " '1151885141': 201,\n",
       " '2610494844': 202,\n",
       " '25999604': 203,\n",
       " '2955724234': 204,\n",
       " '1498254766': 205,\n",
       " '3561137432': 206,\n",
       " '2683913641': 207,\n",
       " '702971420': 208,\n",
       " '622120837': 209,\n",
       " '1592319948': 210,\n",
       " '1374817425': 211,\n",
       " '1383584914': 212,\n",
       " '2924356031': 213,\n",
       " '2288884065': 214,\n",
       " '4126236405': 215,\n",
       " '1845786756': 216,\n",
       " '1439192677': 217,\n",
       " '3779103948': 218,\n",
       " '1708950739': 219,\n",
       " '116674349': 220,\n",
       " '1436872418': 221,\n",
       " '3502847831': 222,\n",
       " '2530285484': 223,\n",
       " '620931168': 224,\n",
       " '2739447505': 225,\n",
       " '3539428285': 226,\n",
       " '615258122': 227,\n",
       " '2064060869': 228,\n",
       " '1829771929': 229,\n",
       " '2945230548': 230,\n",
       " '354333473': 231,\n",
       " '1278050580': 232,\n",
       " '2777472459': 233,\n",
       " '3475131654': 234,\n",
       " '3376956416': 235,\n",
       " '2795929444': 236,\n",
       " '2285107026': 237,\n",
       " '1047008495': 238,\n",
       " '2178551581': 239,\n",
       " '2863707763': 240,\n",
       " '3397274143': 241,\n",
       " '3943014604': 242,\n",
       " '3475980760': 243,\n",
       " '592400339': 244,\n",
       " '1212611096': 245,\n",
       " '1598382429': 246,\n",
       " '1008380446': 247,\n",
       " '3288650677': 248,\n",
       " '2860652622': 249,\n",
       " '189449283': 250,\n",
       " '4270106183': 251,\n",
       " '675216211': 252,\n",
       " '3448037031': 253,\n",
       " '1997109747': 254,\n",
       " '4101475674': 255,\n",
       " '3870329460': 256,\n",
       " '780696608': 257,\n",
       " '1200696517': 258,\n",
       " '1699317246': 259,\n",
       " '2133183406': 260,\n",
       " '2375496319': 261,\n",
       " '1449747272': 262,\n",
       " '1302697811': 263,\n",
       " '3293018394': 264,\n",
       " '4162137499': 265,\n",
       " '2616716533': 266,\n",
       " '2006801563': 267,\n",
       " '3224990733': 268,\n",
       " '3971390766': 269,\n",
       " '2642084542': 270,\n",
       " '3225161223': 271,\n",
       " '1027412797': 272,\n",
       " '3212363005': 273,\n",
       " '2758523549': 274,\n",
       " '96823232': 275,\n",
       " '3010686555': 276,\n",
       " '2693186870': 277,\n",
       " '224130184': 278,\n",
       " '454129950': 279,\n",
       " '3741020356': 280,\n",
       " '528216257': 281,\n",
       " '460864556': 282,\n",
       " '3731778741': 283,\n",
       " '1868937145': 284,\n",
       " '487592799': 285,\n",
       " '2427346312': 286,\n",
       " '73512801': 287,\n",
       " '1979059717': 288,\n",
       " '3765117189': 289,\n",
       " '3677230481': 290,\n",
       " '1619676916': 291,\n",
       " '3809622055': 292,\n",
       " '3108893200': 293,\n",
       " '1684746528': 294,\n",
       " '1517488130': 295,\n",
       " '3129233779': 296,\n",
       " '739312213': 297,\n",
       " '1558108450': 298,\n",
       " '2661235972': 299,\n",
       " '855637924': 300,\n",
       " '2096658932': 301,\n",
       " '3595722243': 302,\n",
       " '1065265666': 303,\n",
       " '1446431799': 304,\n",
       " '757304272': 305,\n",
       " '2496162191': 306,\n",
       " '163992226': 307,\n",
       " '1665286634': 308,\n",
       " '721524089': 309,\n",
       " '932459990': 310,\n",
       " '1615468226': 311,\n",
       " '1300982397': 312,\n",
       " '1184535341': 313,\n",
       " '982365060': 314,\n",
       " '1188520854': 315,\n",
       " '4055585686': 316,\n",
       " '1153175316': 317,\n",
       " '1278199678': 318,\n",
       " '1497621048': 319,\n",
       " '1031579616': 320,\n",
       " '677485601': 321,\n",
       " '1575588609': 322,\n",
       " '1282190480': 323,\n",
       " '4095072803': 324,\n",
       " '3994420727': 325,\n",
       " '3732079142': 326,\n",
       " '2413506174': 327,\n",
       " '3528953266': 328,\n",
       " '4142448875': 329,\n",
       " '2251550828': 330,\n",
       " '2563575913': 331,\n",
       " '385934286': 332,\n",
       " '1442371155': 333,\n",
       " '1985153611': 334,\n",
       " '1476967002': 335,\n",
       " '549024328': 336,\n",
       " '4114296702': 337,\n",
       " '167252762': 338,\n",
       " '970316639': 339,\n",
       " '3508758003': 340,\n",
       " '62454085': 341,\n",
       " '430406405': 342,\n",
       " '1426073917': 343,\n",
       " '840148457': 344,\n",
       " '681990144': 345,\n",
       " '3685139016': 346,\n",
       " '643979828': 347,\n",
       " '2130616732': 348,\n",
       " '4001306055': 349,\n",
       " '1177495191': 350,\n",
       " '3030832826': 351,\n",
       " '1080261006': 352,\n",
       " '516999313': 353,\n",
       " '2610444710': 354,\n",
       " '1649465591': 355,\n",
       " '4230784307': 356,\n",
       " '1868735086': 357,\n",
       " '662862265': 358,\n",
       " '1500808029': 359,\n",
       " '2847152365': 360,\n",
       " '2867445361': 361,\n",
       " '3068622773': 362,\n",
       " '879735388': 363,\n",
       " '1877260733': 364,\n",
       " '2468147494': 365,\n",
       " '1077901228': 366,\n",
       " '2178047697': 367,\n",
       " '1561841388': 368,\n",
       " '1678019857': 369,\n",
       " '867953287': 370,\n",
       " '2370435618': 371,\n",
       " '3335153331': 372,\n",
       " '4213945358': 373,\n",
       " '3083830288': 374,\n",
       " '2364839174': 375,\n",
       " '1476508560': 376,\n",
       " '2814323926': 377,\n",
       " '3532568266': 378,\n",
       " '573886273': 379,\n",
       " '73873746': 380,\n",
       " '2214433187': 381,\n",
       " '914456218': 382,\n",
       " '1127671037': 383,\n",
       " '1911033753': 384,\n",
       " '2381736875': 385,\n",
       " '1061559786': 386,\n",
       " '3772142360': 387,\n",
       " '2681491444': 388,\n",
       " '1822642771': 389,\n",
       " '1603457279': 390,\n",
       " '4207915079': 391,\n",
       " '3693396474': 392,\n",
       " '3712791641': 393,\n",
       " '3339150698': 394,\n",
       " '1238006360': 395,\n",
       " '3584210334': 396,\n",
       " '902747271': 397,\n",
       " '3582327628': 398,\n",
       " '803377467': 399,\n",
       " '4126704525': 400,\n",
       " '3963798906': 401,\n",
       " '339060886': 402,\n",
       " '4104024410': 403,\n",
       " '2419911791': 404,\n",
       " '2301723344': 405,\n",
       " '1827283730': 406,\n",
       " '2050034443': 407,\n",
       " '19969040': 408,\n",
       " '3281130769': 409,\n",
       " '3203638855': 410,\n",
       " '4058800014': 411,\n",
       " '829602148': 412,\n",
       " '3584264094': 413,\n",
       " '2012232184': 414,\n",
       " '3976610702': 415,\n",
       " '2083799098': 416,\n",
       " '3047117217': 417,\n",
       " '2928235619': 418,\n",
       " '1685190005': 419,\n",
       " '1535364758': 420,\n",
       " '4002702641': 421,\n",
       " '3905269676': 422,\n",
       " '647011225': 423,\n",
       " '2092874032': 424,\n",
       " '1838843061': 425,\n",
       " '627290679': 426,\n",
       " '3327557579': 427,\n",
       " '1586611723': 428,\n",
       " '3951904788': 429,\n",
       " '2237079226': 430,\n",
       " '610367576': 431,\n",
       " '921197600': 432,\n",
       " '354110604': 433,\n",
       " '559455545': 434,\n",
       " '3460711091': 435,\n",
       " '2287755049': 436,\n",
       " '2509086570': 437,\n",
       " '514005507': 438,\n",
       " '1074417398': 439,\n",
       " '3580637647': 440,\n",
       " '3708050271': 441,\n",
       " '2063045108': 442,\n",
       " '518779164': 443,\n",
       " '2736362265': 444,\n",
       " '3365253328': 445,\n",
       " '2723447665': 446,\n",
       " '364913652': 447,\n",
       " '2929324744': 448,\n",
       " '321681951': 449,\n",
       " '2372401889': 450,\n",
       " '2711079268': 451,\n",
       " '479066963': 452,\n",
       " '2066686477': 453,\n",
       " '365372360': 454,\n",
       " '2904491741': 455,\n",
       " '2766764102': 456,\n",
       " '2687062464': 457,\n",
       " '473531767': 458,\n",
       " '2540229116': 459,\n",
       " '3855990127': 460,\n",
       " '2725906388': 461,\n",
       " '1200746292': 462,\n",
       " '3466325304': 463,\n",
       " '2489566262': 464,\n",
       " '3946614613': 465,\n",
       " '3008741424': 466,\n",
       " '4159177884': 467,\n",
       " '1396631048': 468,\n",
       " '776270146': 469,\n",
       " '1265003645': 470,\n",
       " '856850297': 471,\n",
       " '4199166902': 472,\n",
       " '1968459057': 473,\n",
       " '1482347455': 474,\n",
       " '3251036208': 475,\n",
       " '1730501944': 476,\n",
       " '3392885130': 477,\n",
       " '1356764145': 478,\n",
       " '1538688389': 479,\n",
       " '372011842': 480,\n",
       " '1797539974': 481,\n",
       " '2322876084': 482,\n",
       " '622462500': 483,\n",
       " '909891171': 484,\n",
       " '517667357': 485,\n",
       " '2309977061': 486,\n",
       " '3145143643': 487,\n",
       " '1667624518': 488,\n",
       " '2549186516': 489,\n",
       " '3273736995': 490,\n",
       " '754435036': 491,\n",
       " '327009161': 492,\n",
       " '1381746236': 493,\n",
       " '37049595': 494,\n",
       " '2223895704': 495,\n",
       " '3272491346': 496,\n",
       " '2964650855': 497,\n",
       " '2402065294': 498,\n",
       " '1385126312': 499,\n",
       " '1065213296': 500,\n",
       " '2742045342': 501,\n",
       " '63412025': 502,\n",
       " '3250536690': 503,\n",
       " '424436456': 504,\n",
       " '1850184598': 505,\n",
       " '4106103231': 506,\n",
       " '1975166347': 507,\n",
       " '119334177': 508,\n",
       " '874026111': 509,\n",
       " '810563397': 510,\n",
       " '3262152241': 511,\n",
       " '3584727515': 512,\n",
       " '3021192688': 513,\n",
       " '149023715': 514,\n",
       " '3175414775': 515,\n",
       " '1977909542': 516,\n",
       " '2447396369': 517,\n",
       " '2795674177': 518,\n",
       " '2812218352': 519,\n",
       " '2490483153': 520,\n",
       " '3482133976': 521,\n",
       " '2560041384': 522,\n",
       " '1773166853': 523,\n",
       " '494878884': 524,\n",
       " '405428132': 525,\n",
       " '631024855': 526,\n",
       " '1301324128': 527,\n",
       " '1334367841': 528,\n",
       " '3502397803': 529,\n",
       " '1319656469': 530,\n",
       " '520657921': 531,\n",
       " '3816082904': 532,\n",
       " '688459086': 533,\n",
       " '2155175647': 534,\n",
       " '1957749285': 535,\n",
       " '2527390511': 536,\n",
       " '1345381188': 537,\n",
       " '4003935165': 538,\n",
       " '456705289': 539,\n",
       " '3935774651': 540,\n",
       " '160567780': 541,\n",
       " '627841864': 542,\n",
       " '1431441098': 543,\n",
       " '2012506795': 544,\n",
       " '1333395072': 545,\n",
       " '2007442218': 546,\n",
       " '3154564127': 547,\n",
       " '940916476': 548,\n",
       " '677713209': 549,\n",
       " '3519205197': 550,\n",
       " '2171647061': 551,\n",
       " '4240253733': 552,\n",
       " '4285933363': 553,\n",
       " '1901024297': 554,\n",
       " '3695530032': 555,\n",
       " '1817434960': 556,\n",
       " '2539912275': 557,\n",
       " '3803058898': 558,\n",
       " '4173617744': 559,\n",
       " '3949816728': 560,\n",
       " '2690882078': 561,\n",
       " '976170144': 562,\n",
       " '3643219070': 563,\n",
       " '3592016808': 564,\n",
       " '1478537984': 565,\n",
       " '874736953': 566,\n",
       " '3034533222': 567,\n",
       " '2438434714': 568,\n",
       " '4281214635': 569,\n",
       " '741577656': 570,\n",
       " '908341071': 571,\n",
       " '1211708085': 572,\n",
       " '2043699274': 573,\n",
       " '3819599403': 574,\n",
       " '45546365': 575,\n",
       " '4156275328': 576,\n",
       " '3565319586': 577,\n",
       " '1057214819': 578,\n",
       " '234679235': 579,\n",
       " '2235166615': 580,\n",
       " '2725806125': 581,\n",
       " '2882404450': 582,\n",
       " '3601943023': 583,\n",
       " '3696273728': 584,\n",
       " '3907154058': 585,\n",
       " '1801766962': 586,\n",
       " '102972049': 587,\n",
       " '1759571467': 588,\n",
       " '3431614259': 589,\n",
       " '3245843888': 590,\n",
       " '872605952': 591,\n",
       " '3566934113': 592,\n",
       " '4279419003': 593,\n",
       " '3478659795': 594,\n",
       " '1311175038': 595,\n",
       " '1296962014': 596,\n",
       " '3051438735': 597,\n",
       " '1019287013': 598,\n",
       " '4077852091': 599,\n",
       " '2997290586': 600,\n",
       " '857528595': 601,\n",
       " '3733069609': 602,\n",
       " '2388719133': 603,\n",
       " '3234626838': 604,\n",
       " '2965673104': 605,\n",
       " '3542981925': 606,\n",
       " '1287319410': 607,\n",
       " '2216327290': 608,\n",
       " '1900974580': 609,\n",
       " '605300738': 610,\n",
       " '99226238': 611,\n",
       " '3016291457': 612,\n",
       " '3624273144': 613,\n",
       " '3532539883': 614,\n",
       " '2173952737': 615,\n",
       " '2459337942': 616,\n",
       " '3548928882': 617,\n",
       " '39798051': 618,\n",
       " '366912974': 619,\n",
       " '3065347634': 620,\n",
       " '3822409571': 621,\n",
       " '1206747600': 622,\n",
       " '41252927': 623,\n",
       " '1052866478': 624,\n",
       " '3757885529': 625,\n",
       " '859794757': 626,\n",
       " '3392045078': 627,\n",
       " '1662125407': 628,\n",
       " '3784929022': 629,\n",
       " '2660106561': 630,\n",
       " '997439313': 631,\n",
       " '1495018044': 632,\n",
       " '3768680031': 633,\n",
       " '619666754': 634,\n",
       " '3359668121': 635,\n",
       " '3311382169': 636,\n",
       " '1107282026': 637,\n",
       " '2033808525': 638,\n",
       " '790991039': 639,\n",
       " '2894128040': 640,\n",
       " '782683695': 641,\n",
       " '3980899956': 642,\n",
       " '3697332645': 643,\n",
       " '150954316': 644,\n",
       " '1570124449': 645,\n",
       " '911059261': 646,\n",
       " '3795400528': 647,\n",
       " '1024326314': 648,\n",
       " '427802891': 649,\n",
       " '3100616465': 650,\n",
       " '518255514': 651,\n",
       " '4116238303': 652,\n",
       " '1433144650': 653,\n",
       " '2227570794': 654,\n",
       " '860542786': 655,\n",
       " '541805002': 656,\n",
       " '2282236511': 657,\n",
       " '4224651969': 658,\n",
       " '944464786': 659,\n",
       " '2749567415': 660,\n",
       " '2746757137': 661,\n",
       " '4031268561': 662,\n",
       " '3894686399': 663,\n",
       " '1698013825': 664,\n",
       " '3569123252': 665,\n",
       " '2306874322': 666,\n",
       " '424727744': 667,\n",
       " '3382083674': 668,\n",
       " '1213571589': 669,\n",
       " '3926115451': 670,\n",
       " '1869298828': 671,\n",
       " '2030036850': 672,\n",
       " '1887085024': 673,\n",
       " '966176695': 674,\n",
       " '1951884947': 675,\n",
       " '1714855985': 676,\n",
       " '1110856401': 677,\n",
       " '2311929369': 678,\n",
       " '2902382716': 679,\n",
       " '3253541195': 680,\n",
       " '1159994451': 681,\n",
       " '478399061': 682,\n",
       " '3631560512': 683,\n",
       " '2006433337': 684,\n",
       " '4171800062': 685,\n",
       " '3105234108': 686,\n",
       " '817091493': 687,\n",
       " '1044976823': 688,\n",
       " '3464109973': 689,\n",
       " '3873074356': 690,\n",
       " '1487572967': 691,\n",
       " '3705342940': 692,\n",
       " '3982673935': 693,\n",
       " '8689602': 694,\n",
       " '1876951633': 695,\n",
       " '4004100709': 696,\n",
       " '3484398328': 697,\n",
       " '1517264651': 698,\n",
       " '4079544335': 699,\n",
       " '1356197621': 700,\n",
       " '825378558': 701,\n",
       " '1036961928': 702,\n",
       " '2517768885': 703,\n",
       " '3695348432': 704,\n",
       " '1781691537': 705,\n",
       " '2567519500': 706,\n",
       " '3538627205': 707,\n",
       " '3523548061': 708,\n",
       " '3764710330': 709,\n",
       " '2596820298': 710,\n",
       " '3412735820': 711,\n",
       " '406835364': 712,\n",
       " '2541362268': 713,\n",
       " '259138075': 714,\n",
       " '1438024794': 715,\n",
       " '4125754193': 716,\n",
       " '1707209495': 717,\n",
       " '2433217376': 718,\n",
       " '2099766081': 719,\n",
       " '1365361942': 720,\n",
       " '4193798139': 721,\n",
       " '1855529308': 722,\n",
       " '465657177': 723,\n",
       " '1718413929': 724,\n",
       " '1550429882': 725,\n",
       " '3118191184': 726,\n",
       " '820270984': 727,\n",
       " '1537309411': 728,\n",
       " '691970985': 729,\n",
       " '61104529': 730,\n",
       " '2627817703': 731,\n",
       " '2118802492': 732,\n",
       " '1181196255': 733,\n",
       " '1002527123': 734,\n",
       " '3913698961': 735,\n",
       " '3086023208': 736,\n",
       " '485940642': 737,\n",
       " '2669889830': 738,\n",
       " '724467061': 739,\n",
       " '2957317168': 740,\n",
       " '1958801669': 741,\n",
       " '3657281130': 742,\n",
       " '3895533864': 743,\n",
       " '3785806105': 744,\n",
       " '3634650435': 745,\n",
       " '1650425619': 746,\n",
       " '949540452': 747,\n",
       " '1283564583': 748,\n",
       " '495818697': 749,\n",
       " '2107626799': 750,\n",
       " '1953210268': 751,\n",
       " '2118381839': 752,\n",
       " '877025255': 753,\n",
       " '1885776015': 754,\n",
       " '4235989936': 755,\n",
       " '1770070953': 756,\n",
       " '82114475': 757,\n",
       " '3841696821': 758,\n",
       " '3677852886': 759,\n",
       " '1045903606': 760,\n",
       " '2193202466': 761,\n",
       " '1853025775': 762,\n",
       " '1112861675': 763,\n",
       " '2976972203': 764,\n",
       " '2373658545': 765,\n",
       " '3626895603': 766,\n",
       " '2081508008': 767,\n",
       " '2933182879': 768,\n",
       " '1215610011': 769,\n",
       " '352549477': 770,\n",
       " '1430932461': 771,\n",
       " '776090102': 772,\n",
       " '1621860415': 773,\n",
       " '3081148851': 774,\n",
       " '2974043284': 775,\n",
       " '3176591677': 776,\n",
       " '1458609238': 777,\n",
       " '3612574786': 778,\n",
       " '3002725183': 779,\n",
       " '2292989534': 780,\n",
       " '1629994289': 781,\n",
       " '4013344838': 782,\n",
       " '4172382349': 783,\n",
       " '3976565573': 784,\n",
       " '238130760': 785,\n",
       " '2760908744': 786,\n",
       " '3066409179': 787,\n",
       " '3526449172': 788,\n",
       " '2639426191': 789,\n",
       " '2953070154': 790,\n",
       " '1623007742': 791,\n",
       " '3900421971': 792,\n",
       " '3231087603': 793,\n",
       " '3306999925': 794,\n",
       " '1601859970': 795,\n",
       " '705007381': 796,\n",
       " '505464566': 797,\n",
       " '321356225': 798,\n",
       " '4119475665': 799,\n",
       " '4070503980': 800,\n",
       " '783438771': 801,\n",
       " '1783683714': 802,\n",
       " '3831365201': 803,\n",
       " '3520307709': 804,\n",
       " '3860304218': 805,\n",
       " '533528858': 806,\n",
       " '3026366413': 807,\n",
       " '299923852': 808,\n",
       " '2131593904': 809,\n",
       " '3869122811': 810,\n",
       " '2177332500': 811,\n",
       " '2663748542': 812,\n",
       " '2809607334': 813,\n",
       " '2530713176': 814,\n",
       " '2120836099': 815,\n",
       " '2915017543': 816,\n",
       " '1218703283': 817,\n",
       " '4036898534': 818,\n",
       " '2612400653': 819,\n",
       " '3175383828': 820,\n",
       " '2413298858': 821,\n",
       " '78990001': 822,\n",
       " '1515297223': 823,\n",
       " '419532466': 824,\n",
       " '3191079645': 825,\n",
       " '2259216780': 826,\n",
       " '856365940': 827,\n",
       " '817533556': 828,\n",
       " '2935869426': 829,\n",
       " '604122973': 830,\n",
       " '4055258235': 831,\n",
       " '2724829651': 832,\n",
       " '1510189693': 833,\n",
       " '3372530401': 834,\n",
       " '10609218': 835,\n",
       " '3029154892': 836,\n",
       " '1929622843': 837,\n",
       " '183488039': 838,\n",
       " '3900687073': 839,\n",
       " '1050485549': 840,\n",
       " '446790947': 841,\n",
       " '820281725': 842,\n",
       " '3816177993': 843,\n",
       " '3469133902': 844,\n",
       " '3277893314': 845,\n",
       " '2839246665': 846,\n",
       " '1102483880': 847,\n",
       " '3127514099': 848,\n",
       " '2604206550': 849,\n",
       " '2009815686': 850,\n",
       " '3388395910': 851,\n",
       " '1943255162': 852,\n",
       " '3429508261': 853,\n",
       " '730958187': 854,\n",
       " '477854779': 855,\n",
       " '934841988': 856,\n",
       " '711611185': 857,\n",
       " '1826801459': 858,\n",
       " '566999193': 859,\n",
       " '1750473893': 860,\n",
       " '16336968': 861,\n",
       " '104363229': 862,\n",
       " '3999157860': 863,\n",
       " '2114066448': 864,\n",
       " '1829117208': 865,\n",
       " '3134728841': 866,\n",
       " '100417525': 867,\n",
       " '561449801': 868,\n",
       " '335083853': 869,\n",
       " '472751320': 870,\n",
       " '284203489': 871,\n",
       " '40927922': 872,\n",
       " '2529252158': 873,\n",
       " '3664010858': 874,\n",
       " '1013377163': 875,\n",
       " '4181724338': 876,\n",
       " '3009383497': 877,\n",
       " '931545300': 878,\n",
       " '4290944875': 879,\n",
       " '1117683378': 880,\n",
       " '200617202': 881,\n",
       " '153997026': 882,\n",
       " '3771678348': 883,\n",
       " '1275822727': 884,\n",
       " '3126566992': 885,\n",
       " '374048503': 886,\n",
       " '814953804': 887,\n",
       " '1744782555': 888,\n",
       " '412401632': 889,\n",
       " '3603231279': 890,\n",
       " '258625053': 891,\n",
       " '1798745078': 892,\n",
       " '1383961059': 893,\n",
       " '668426704': 894,\n",
       " '3769856229': 895,\n",
       " '4239674720': 896,\n",
       " '1532857056': 897,\n",
       " '322219501': 898,\n",
       " '827652101': 899,\n",
       " '3302915994': 900,\n",
       " '4072954907': 901,\n",
       " '2639332741': 902,\n",
       " '3801659300': 903,\n",
       " '3915228859': 904,\n",
       " '440098296': 905,\n",
       " '1940265937': 906,\n",
       " '4128714692': 907,\n",
       " '3698756944': 908,\n",
       " '3508322706': 909,\n",
       " '287790200': 910,\n",
       " '137279329': 911,\n",
       " '2631498914': 912,\n",
       " '3896920659': 913,\n",
       " '3470999302': 914,\n",
       " '1480770482': 915,\n",
       " '4090120152': 916,\n",
       " '982352471': 917,\n",
       " '992455200': 918,\n",
       " '3906525090': 919,\n",
       " '1187251277': 920,\n",
       " '3322294869': 921,\n",
       " '3836582687': 922,\n",
       " '870899349': 923,\n",
       " '1428413928': 924,\n",
       " '2180353936': 925,\n",
       " '1442729750': 926,\n",
       " '2876585126': 927,\n",
       " '1008778548': 928,\n",
       " '1633278155': 929,\n",
       " '3732064146': 930,\n",
       " '1436270591': 931,\n",
       " '1971604001': 932,\n",
       " '1077729842': 933,\n",
       " '3909691421': 934,\n",
       " '1557270795': 935,\n",
       " '367125363': 936,\n",
       " '3298251587': 937,\n",
       " '1732259589': 938,\n",
       " '519705421': 939,\n",
       " '329123657': 940,\n",
       " '250211214': 941,\n",
       " '1734223694': 942,\n",
       " '2020109352': 943,\n",
       " '1516820536': 944,\n",
       " '130281276': 945,\n",
       " '3364485916': 946,\n",
       " '3374990111': 947,\n",
       " '3174369333': 948,\n",
       " '4152301247': 949,\n",
       " '2962436572': 950,\n",
       " '124311055': 951,\n",
       " '2030801280': 952,\n",
       " '1988626107': 953,\n",
       " '815715465': 954,\n",
       " '429380726': 955,\n",
       " '1364538695': 956,\n",
       " '3553843473': 957,\n",
       " '401222554': 958,\n",
       " '4150025051': 959,\n",
       " '1973140471': 960,\n",
       " '2433903660': 961,\n",
       " '2850509209': 962,\n",
       " '1415883658': 963,\n",
       " '3124721005': 964,\n",
       " '1721071411': 965,\n",
       " '2385338568': 966,\n",
       " '4234138929': 967,\n",
       " '3057595260': 968,\n",
       " '2583046456': 969,\n",
       " '3290701166': 970,\n",
       " '3907799000': 971,\n",
       " '2150760652': 972,\n",
       " '2586592033': 973,\n",
       " '2021865435': 974,\n",
       " '3921048124': 975,\n",
       " '835154923': 976,\n",
       " '3355058483': 977,\n",
       " '2139975650': 978,\n",
       " '1658604508': 979,\n",
       " '2078281088': 980,\n",
       " '2705317682': 981,\n",
       " '3968521389': 982,\n",
       " '968240954': 983,\n",
       " '1247327810': 984,\n",
       " '1209963424': 985,\n",
       " '573627113': 986,\n",
       " '4211536449': 987,\n",
       " '758156246': 988,\n",
       " '1739587848': 989,\n",
       " '2950866342': 990,\n",
       " '421586005': 991,\n",
       " '563069256': 992,\n",
       " '3102156677': 993,\n",
       " '475538891': 994,\n",
       " '1011764733': 995,\n",
       " '361570611': 996,\n",
       " '4211398584': 997,\n",
       " '121419074': 998,\n",
       " '1427701461': 999,\n",
       " ...}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eventIndex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Encoder that turns raw event columns into numeric feature values.\n",
    "FE = FeatureEng()\n",
    "\n",
    "# Columns of events.csv: event_id, user_id, start_time, city, state, zip, country,\n",
    "# lat, lng, then 101 word-count columns.\n",
    "# The handle stays open on purpose: the cell that encodes the rows reads from it and closes it.\n",
    "fin = open(\"./data/events.csv\", mode='r')\n",
    "fin.readline()  # discard the header row\n",
    "\n",
    "# Event property features: start_time, city, state, zip, country, lat, lng.\n",
    "eventPropMatrix = ss.dok_matrix((n_events, 7))\n",
    "\n",
    "# Word-count features.\n",
    "eventContMatrix = ss.dok_matrix((n_events, 101))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode every event that is present in eventIndex (i.e. appears in the train or test set).\n",
    "# Rows are streamed from the file one at a time instead of being materialised with\n",
    "# readlines(), and the handle is closed even if a malformed row raises part-way through.\n",
    "try:\n",
    "    for line in fin:\n",
    "        cols = line.strip().split(\",\")\n",
    "        eventId = str(cols[0])\n",
    "\n",
    "        if eventId not in eventIndex:\n",
    "            continue\n",
    "        i = eventIndex[eventId]\n",
    "\n",
    "        # Deliberately simple encoding of the event properties; start time and\n",
    "        # location are important signals and would deserve richer features.\n",
    "        eventPropMatrix[i, 0] = FE.getJoinedYearMonth(cols[2]) # start_time\n",
    "        eventPropMatrix[i, 1] = FE.getFeatureHash(cols[3]) # city\n",
    "        eventPropMatrix[i, 2] = FE.getFeatureHash(cols[4]) # state\n",
    "        eventPropMatrix[i, 3] = FE.getFeatureHash(cols[5]) # zip\n",
    "        eventPropMatrix[i, 4] = FE.getFeatureHash(cols[6]) # country\n",
    "        eventPropMatrix[i, 5] = FE.getFloatValue(cols[7]) # lat\n",
    "        eventPropMatrix[i, 6] = FE.getFloatValue(cols[8]) # lon\n",
    "\n",
    "        # Word counts: CSV columns 9..109 map to matrix columns 0..100.\n",
    "        for j in range(9, 110):\n",
    "            eventContMatrix[i, j-9] = cols[j]\n",
    "finally:\n",
    "    fin.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<13418x7 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 0 stored elements in Dictionary Of Keys format>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eventPropMatrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eventContMatrix[0, 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# L2-normalise each feature column (axis=0) of the property matrix, then save it\n",
    "# in Matrix Market format.\n",
    "eventPropMatrix = normalize(eventPropMatrix, norm=\"l2\", axis=0, copy=False)\n",
    "sio.mmwrite(\"EV_eventPropMatrix\", eventPropMatrix)\n",
    "\n",
    "# Same treatment for the word-count matrix. These features could also be clustered\n",
    "# to derive an event genre.\n",
    "eventContMatrix = normalize(eventContMatrix, norm=\"l2\", axis=0, copy=False)\n",
    "sio.mmwrite(\"EV_eventContMatrix\", eventContMatrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the pickled event pairs; the context manager guarantees the file handle is closed.\n",
    "# NOTE: unpickling executes arbitrary code, so only load files produced by this project.\n",
    "with open(\"./data/PE_uniqueEventPairs.pkl\", 'rb') as pairs_file:\n",
    "    uniqueEventPairs = _pickle.load(pairs_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Anaconda3\\lib\\site-packages\\scipy\\spatial\\distance.py:543: RuntimeWarning: invalid value encountered in double_scalars\n",
      "  dist = 1.0 - np.dot(um, vm) / (norm(um) * norm(vm))\n",
      "C:\\Anaconda3\\lib\\site-packages\\scipy\\spatial\\distance.py:505: RuntimeWarning: invalid value encountered in double_scalars\n",
      "  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))\n"
     ]
    }
   ],
   "source": [
    "# Symmetric pairwise event matrices, filled only for the pairs in uniqueEventPairs.\n",
    "# NOTE: scipy's correlation() and cosine() return *distances* (1 - coefficient), so a\n",
    "# smaller stored value means the two events are more alike.\n",
    "# A row with zero norm (or zero variance for correlation) produces NaN and a RuntimeWarning.\n",
    "eventPropSim = ss.dok_matrix((n_events, n_events))\n",
    "eventContSim = ss.dok_matrix((n_events, n_events))\n",
    "\n",
    "for i, j in uniqueEventPairs:\n",
    "    # Property features: Pearson correlation distance.\n",
    "    # Rows are flattened to 1-D arrays, the input shape scipy's distance functions expect.\n",
    "    if (i,j) not in eventPropSim:\n",
    "        epsim = ssd.correlation(eventPropMatrix.getrow(i).toarray().ravel(),\n",
    "            eventPropMatrix.getrow(j).toarray().ravel())\n",
    "\n",
    "        eventPropSim[i, j] = epsim\n",
    "        eventPropSim[j, i] = epsim\n",
    "\n",
    "    # Word-count features: cosine distance (histogram intersection or Jaccard would also work).\n",
    "    if (i,j) not in eventContSim:\n",
    "        ecsim = ssd.cosine(eventContMatrix.getrow(i).toarray().ravel(),\n",
    "            eventContMatrix.getrow(j).toarray().ravel())\n",
    "\n",
    "        # Store the word-count distance itself, not the property distance computed above.\n",
    "        eventContSim[i, j] = ecsim\n",
    "        eventContSim[j, i] = ecsim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
